{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "uA0d_STPW7za"
      },
      "outputs": [],
      "source": [
        "!pip -q install datasets transformers"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Loading the IMDB dataset"
      ],
      "metadata": {
        "id": "8Ud4E6TlXFXP"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "imdb_dataset = load_dataset(\"imdb\")"
      ],
      "metadata": {
        "id": "GpXxbN_bXBfo"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Tokenize the IMDB dataset"
      ],
      "metadata": {
        "id": "VCFJODgdXW33"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "from transformers import AutoTokenizer\n",
        "\n",
        "# Load a small portion of the IMDB dataset (100 samples)\n",
        "imdb_dataset = load_dataset(\"imdb\", split=\"train[:100]\")\n",
        "\n",
        "# Initialize the tokenizer\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
        "\n",
        "# Tokenize the IMDB dataset with truncation and padding\n",
        "tokenized_imdb_dataset = imdb_dataset.map(\n",
        "lambda x: tokenizer(x[\"text\"], truncation=True, padding=\"max_length\")\n",
        ")\n",
        "\n",
        "print(tokenized_imdb_dataset)\n",
        "\n",
        "# Get the first row of tokens\n",
        "first_row_tokens = tokenized_imdb_dataset[0][\"input_ids\"]\n",
        "\n",
        "# Print the first 10 tokens and their corresponding words\n",
        "for token in first_row_tokens[:10]:\n",
        "  print(f\"Token: {token}, Word: {tokenizer.decode([token])}\")"
      ],
      "metadata": {
        "id": "uFW5jLzJXe6d"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Spaces example code\n",
        "Set up Gradio interface"
      ],
      "metadata": {
        "id": "4sEIuAX1Y12I"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install gradio transformers"
      ],
      "metadata": {
        "id": "6b9qSiukY9kK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import gradio as gr\n",
        "from transformers import pipeline\n",
        "\n",
        "sentiment_pipeline = pipeline(\"sentiment-analysis\")\n",
        "\n",
        "def sentiment_analysis(text):\n",
        "    result = sentiment_pipeline(text)\n",
        "    return result[0][\"label\"]\n",
        "\n",
        "iface = gr.Interface(fn=sentiment_analysis, inputs=\"text\", outputs=\"text\")\n",
        "iface.launch()"
      ],
      "metadata": {
        "id": "MEvaRRUpY5tB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Elastic Eland"
      ],
      "metadata": {
        "id": "1821xMLhZRGw"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install eland"
      ],
      "metadata": {
        "id": "zeHEK6A-ZbA7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# TODO ADD INDEX CREATION EXAMPLE"
      ],
      "metadata": {
        "id": "wB_055jlaH2H"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Connect to Elasticsearch and create sample index"
      ],
      "metadata": {
        "id": "iLkd_1KWo6ba"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import getpass\n",
        "from datetime import datetime\n",
        "\n",
        "# Imported here so this cell runs on a fresh kernel; the client is used just below.\n",
        "from elasticsearch import Elasticsearch\n",
        "\n",
        "# Prompt for credentials instead of hardcoding them in the notebook.\n",
        "es_cloud_id = getpass.getpass('Enter Elastic Cloud ID:  ')\n",
        "es_api_key = getpass.getpass('Enter cluster API key:  ')\n",
        "\n",
        "es = Elasticsearch(cloud_id=es_cloud_id,\n",
        "                   api_key=es_api_key\n",
        "                   )\n",
        "es.info() # should return cluster info\n",
        "\n",
        "# Index settings and field types for the sample data used by the Eland examples.\n",
        "mapping = {\n",
        "    \"settings\": {\n",
        "        \"number_of_shards\": 1,\n",
        "        \"number_of_replicas\": 0\n",
        "    },\n",
        "    \"mappings\": {\n",
        "        \"properties\": {\n",
        "            \"some_field\": {\"type\": \"float\"},\n",
        "            \"column_a\": {\"type\": \"float\"},\n",
        "            \"column_b\": {\"type\": \"float\"},\n",
        "            \"category\": {\"type\": \"keyword\"},\n",
        "            \"value\": {\"type\": \"float\"}\n",
        "        }\n",
        "    }\n",
        "}\n",
        "\n",
        "# Create the index\n",
        "es.indices.create(index=\"sample_eland_index\", body=mapping)\n",
        "\n",
        "# Populate the index with a small dataset\n",
        "documents = [\n",
        "    {\"some_field\": 95.0, \"column_a\": 5.0, \"column_b\": 10.0, \"category\": \"A\", \"value\": 50.0},\n",
        "    {\"some_field\": 150.0, \"column_a\": 7.0, \"column_b\": 20.0, \"category\": \"B\", \"value\": 140.0},\n",
        "    {\"some_field\": 200.0, \"column_a\": 8.0, \"column_b\": 25.0, \"category\": \"A\", \"value\": 200.0},\n",
        "    {\"some_field\": 50.0, \"column_a\": 4.0, \"column_b\": 12.5, \"category\": \"C\", \"value\": 50.0}\n",
        "]\n",
        "\n",
        "for doc in documents:\n",
        "    es.index(index=\"sample_eland_index\", body=doc)\n",
        "\n",
        "# Newly indexed documents only become searchable after a refresh; force one\n",
        "# so the Eland cells below see all four documents when run immediately.\n",
        "es.indices.refresh(index=\"sample_eland_index\")"
      ],
      "metadata": {
        "id": "8vnErFw4o8eh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Eland Examples"
      ],
      "metadata": {
        "id": "VJzKKd6opNLf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import eland as ed\n",
        "\n",
        "df = ed.DataFrame(es_client=es, es_index_pattern=\"sample_eland_index\")\n",
        "filtered_df = df[df['some_field'] > 100]\n",
        "filtered_df"
      ],
      "metadata": {
        "id": "puzcRKXHZTeB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "average_value = df['some_field'].mean()\n",
        "average_value"
      ],
      "metadata": {
        "id": "kchuV7LmZ94y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import seaborn as sns\n",
        "import pandas as pd\n",
        "\n",
        "filtered_df = df[df['some_field'] > 100]\n",
        "pandas_df = filtered_df.to_pandas()\n",
        "sns.boxplot(x='category', y='value', data=pandas_df)"
      ],
      "metadata": {
        "id": "OUt_BZrQaA1l"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Loading a Sentence Transformer from Hugging Face into Elasticsearch"
      ],
      "metadata": {
        "id": "9LyU11jZaK_s"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "pip -q install eland elasticsearch transformers sentence_transformers torch==1.13"
      ],
      "metadata": {
        "id": "ERArF6WFaNt6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from pathlib import Path\n",
        "from eland.ml.pytorch import PyTorchModel\n",
        "from eland.ml.pytorch.transformers import TransformerModel\n",
        "from elasticsearch import Elasticsearch\n",
        "from elasticsearch.client import MlClient"
      ],
      "metadata": {
        "id": "WD-ajPzoaiCf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import getpass"
      ],
      "metadata": {
        "id": "z1HM9y8namy8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "es_cloud_id = getpass.getpass('Enter Elastic Cloud ID:  ')\n",
        "es_api_key = getpass.getpass('Enter cluster API key:  ')"
      ],
      "metadata": {
        "id": "8CboLpvAatQv"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "es = Elasticsearch(cloud_id=es_cloud_id,\n",
        "                   api_key=es_api_key\n",
        "                   )\n",
        "es.info() # should return cluster info"
      ],
      "metadata": {
        "id": "NlNW7YKJau7O"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Hugging Face model ID of the sentence-transformer to load.\n",
        "hf_model_id='sentence-transformers/msmarco-MiniLM-L-12-v3'\n",
        "# Pass the variable (not a string literal) so the model ID above is the one used;\n",
        "# the model is configured for the text_embedding task.\n",
        "tm = TransformerModel(model_id=hf_model_id, task_type=\"text_embedding\")"
      ],
      "metadata": {
        "id": "hSyXy5aJa10s"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "es_model_id = tm.elasticsearch_model_id()\n",
        "es_model_id"
      ],
      "metadata": {
        "id": "I4MsWWhga4CX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "tmp_path = \"models\"\n",
        "Path(tmp_path).mkdir(parents=True, exist_ok=True)\n",
        "model_path, config, vocab_path = tm.save(tmp_path)"
      ],
      "metadata": {
        "id": "eKtX28bTa50r"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "ptm = PyTorchModel(es, es_model_id)\n",
        "ptm.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)"
      ],
      "metadata": {
        "id": "nI3Ppm9ea7vW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# List the model in Elasticsearch\n",
        "m = MlClient.get_trained_models(es, model_id=es_model_id)\n",
        "m.body"
      ],
      "metadata": {
        "id": "AvRE1GPwa9aZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "s = MlClient.start_trained_model_deployment(es, model_id=es_model_id)\n",
        "s.body"
      ],
      "metadata": {
        "id": "_L1rccH1a-zu"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "stats = MlClient.get_trained_models_stats(es, model_id=es_model_id)\n",
        "stats.body['trained_model_stats'][0]['deployment_stats']['nodes'][0]['routing_state']"
      ],
      "metadata": {
        "id": "-BQB368zbAyT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "docs =  [\n",
        "    {\n",
        "      \"text_field\": \"Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.\"\n",
        "    }\n",
        "  ]"
      ],
      "metadata": {
        "id": "ZELyEJvUbLOW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "z = MlClient.infer_trained_model(es, model_id=es_model_id, docs=docs, )"
      ],
      "metadata": {
        "id": "hZ8U1oHGbM4l"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "doc_0_vector = z['inference_results'][0]['predicted_value']\n",
        "doc_0_vector"
      ],
      "metadata": {
        "id": "5q3j1WDmbNRT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Reducing Dimensionality"
      ],
      "metadata": {
        "id": "UJT9EOxmn6pZ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from sklearn import datasets\n",
        "from sklearn.decomposition import PCA"
      ],
      "metadata": {
        "id": "Yh5QvWj-n9Ri"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the Iris dataset\n",
        "iris = datasets.load_iris()\n",
        "X = iris.data\n",
        "y = iris.target\n",
        "\n",
        "# Apply PCA for dimensionality reduction\n",
        "pca = PCA(n_components=2)\n",
        "X_reduced = pca.fit_transform(X)"
      ],
      "metadata": {
        "id": "TyYMdCXNoChF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Visualize the original data\n",
        "plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')\n",
        "plt.xlabel('Sepal length')\n",
        "plt.ylabel('Sepal width')\n",
        "plt.title('Original Iris dataset')\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "HgQwUpBJoFIg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Visualize the reduced data\n",
        "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')\n",
        "plt.xlabel('First Principal Component')\n",
        "plt.ylabel('Second Principal Component')\n",
        "plt.title('Iris dataset after PCA')\n",
        "plt.show()\n"
      ],
      "metadata": {
        "id": "w2x7VthJoIMu"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Quantization"
      ],
      "metadata": {
        "id": "eUluTeh2oXYX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "from sklearn import datasets\n",
        "from sklearn.decomposition import PCA\n",
        "from sklearn.preprocessing import MinMaxScaler, QuantileTransformer\n",
        ""
      ],
      "metadata": {
        "id": "RPY1tGMvoY7L"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the digits dataset\n",
        "digits = datasets.load_digits()\n",
        "X = digits.data\n",
        "\n",
        "# Print the first example from the original dataset\n",
        "print(\"Original dataset (first example):\\n\", X[0])"
      ],
      "metadata": {
        "id": "1zUhu904oede"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Apply PCA for dimensionality reduction\n",
        "pca = PCA(n_components=10)\n",
        "X_reduced = pca.fit_transform(X)\n",
        "\n",
        "# Print the first example after PCA\n",
        "print(\"\\nReduced dataset after PCA (first example):\\n\", X_reduced[0])"
      ],
      "metadata": {
        "id": "xWSyzBeqoltn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Normalize the reduced vectors to the range [0, 255]\n",
        "scaler = MinMaxScaler((0, 255))\n",
        "X_scaled = scaler.fit_transform(X_reduced)\n",
        "\n",
        "# Print the first example after normalization\n",
        "print(\"\\nScaled dataset after normalization (first example):\\n\", X_scaled[0])"
      ],
      "metadata": {
        "id": "AcNn_VWqon0t"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Quantize the scaled vectors to 8-bit integers\n",
        "X_quantized = np.round(X_scaled).astype(np.uint8)\n",
        "\n",
        "# Print the first example after quantization\n",
        "print(\"\\nQuantized dataset (first example):\\n\", X_quantized[0])"
      ],
      "metadata": {
        "id": "B-R0I53hoqNj"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
